Data
dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv", header=TRUE)
Claim Partial Dependency XGB Classification dataset exported from a Python notebook
pd_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA_PartialDependency.csv", header=TRUE)
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.6.5 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggplot2)
colnames(dataset)
## [1] "modeldata_id"
## [2] "systemidstart"
## [3] "systemidend"
## [4] "cal_year"
## [5] "startdate"
## [6] "enddate"
## [7] "startdatetm"
## [8] "enddatetm"
## [9] "ecy"
## [10] "log_ecy"
## [11] "policynumber"
## [12] "policy_uniqueid"
## [13] "policyterm"
## [14] "policytype"
## [15] "effectivedate"
## [16] "expirationdate"
## [17] "policystate"
## [18] "policyform"
## [19] "persistency"
## [20] "companycd"
## [21] "carriercd"
## [22] "agency_group"
## [23] "producername"
## [24] "territory"
## [25] "risknumber"
## [26] "risktype"
## [27] "yearbuilt"
## [28] "log_yearbuilt"
## [29] "sqft"
## [30] "log_sqft"
## [31] "stories"
## [32] "roofcd"
## [33] "roofcd_encd"
## [34] "units"
## [35] "occupancycd"
## [36] "occupancy_encd"
## [37] "allperilded"
## [38] "waterded"
## [39] "protectionclass"
## [40] "constructioncd"
## [41] "constructioncd_encd"
## [42] "fire_risk_model_score"
## [43] "multipolicyind"
## [44] "multipolicyindumbrella"
## [45] "earthquakeumbrellaind"
## [46] "usagetype"
## [47] "usagetype_encd"
## [48] "ordinanceorlawpct"
## [49] "functionalreplacementcost"
## [50] "homegardcreditind"
## [51] "sprinklersystem"
## [52] "landlordind"
## [53] "rentersinsurance"
## [54] "firealarmtype"
## [55] "burglaryalarmtype"
## [56] "waterdetectiondevice"
## [57] "neighborhoodcrimewatchind"
## [58] "propertymanager"
## [59] "safeguardplusind"
## [60] "kitchenfireextinguisherind"
## [61] "gatedcommunityind"
## [62] "deadboltind"
## [63] "poolind"
## [64] "replacementcostdwellingind"
## [65] "replacementvalueind"
## [66] "serviceline"
## [67] "equipmentbreakdown"
## [68] "numberoffamilies"
## [69] "insuredage"
## [70] "maritalstatus"
## [71] "insurancescore"
## [72] "overriddeninsurancescore"
## [73] "insurancescorevalue"
## [74] "insscoretiervalueband"
## [75] "financialstabilitytier"
## [76] "allcov_wp"
## [77] "cova_wp"
## [78] "cova_ep"
## [79] "cova_deductible"
## [80] "log_cova_deductible"
## [81] "cova_limit"
## [82] "log_cova_limit"
## [83] "cova_ic_nc_water"
## [84] "hasclaim"
## [85] "cova_il_nc_water"
## [86] "log_cova_il_nc_water"
## [87] "water_risk_3_blk"
## [88] "log_water_risk_3_blk"
## [89] "water_risk_fre_3_blk"
## [90] "log_water_risk_fre_3_blk"
## [91] "water_risk_sev_3_blk"
## [92] "log_water_risk_sev_3_blk"
## [93] "appl_fail_3_blk"
## [94] "fixture_leak_3_blk"
## [95] "pipe_froze_3_blk"
## [96] "plumb_leak_3_blk"
## [97] "rep_cost_3_blk"
## [98] "ustructure_fail_3_blk"
## [99] "waterh_fail_3_blk"
## [100] "loaddate"
## [101] "customer_cnt_active_policies"
## [102] "customer_cnt_active_policies_binned"
#dataset <- dataset[,all]
str(dataset)
## 'data.frame': 2306865 obs. of 102 variables:
## $ modeldata_id : int 926880 369590 371058 366474 367033 430043 1059307 372727 373618 369334 ...
## $ systemidstart : int 2517514 786489 788544 782278 783043 962975 3070559 791048 792228 786104 ...
## $ systemidend : int 2517514 786489 788557 782278 949234 962975 3070559 1190404 1068944 786104 ...
## $ cal_year : int 2015 2012 2010 2011 2011 2011 2015 2012 2011 2011 ...
## $ startdate : Factor w/ 4135 levels "2009-01-01","2009-01-02",..: 2192 1096 628 731 882 997 2436 1096 1044 731 ...
## $ enddate : Factor w/ 4104 levels "2009-01-09 00:00:00",..: 2166 1190 698 754 1063 1063 2524 1262 1063 741 ...
## $ startdatetm : Factor w/ 22990 levels "2008-01-09 00:00:00",..: 11981 4408 3173 2108 4554 5397 13368 4904 5705 2038 ...
## $ enddatetm : Factor w/ 16232 levels "2009-01-09 00:00:00",..: 7522 3965 3045 2253 4065 4530 9644 4258 3939 2205 ...
## $ ecy : num 0.0191 0.3477 0.2819 0.1533 0.5859 ...
## $ log_ecy : num -3.958 -1.056 -1.266 -1.875 -0.535 ...
## $ policynumber : Factor w/ 243197 levels "AZF0082147","AZF0221975",..: 97 116 152 277 789 585 18 201 220 110 ...
## $ policy_uniqueid : int 823918 365559 366890 362938 363414 439016 931611 368459 369193 365318 ...
## $ policyterm : int 5 2 1 1 2 2 6 2 2 1 ...
## $ policytype : Factor w/ 2 levels "New","Renewal": 2 2 1 1 2 2 2 2 2 1 ...
## $ effectivedate : Factor w/ 4178 levels "2008-01-09","2008-01-14",..: 2206 900 671 465 925 1040 2479 972 1087 452 ...
## $ expirationdate : Factor w/ 4299 levels "2009-01-09","2009-01-14",..: 2327 1022 792 586 1047 1162 2601 1094 1209 573 ...
## $ policystate : Factor w/ 3 levels "AZ","CA","NV": 1 1 1 1 1 1 1 1 1 1 ...
## $ policyform : Factor w/ 9 levels "DF1","DF3","DF6",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ persistency : int 13 9 6 4 2 3 15 6 6 8 ...
## $ companycd : int 1 1 1 1 1 1 1 1 1 1 ...
## $ carriercd : Factor w/ 2 levels "CSEICO","CSESG": 1 1 1 1 1 1 1 1 1 1 ...
## $ agency_group : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 464 374 504 374 579 260 96 533 245 542 ...
## $ producername : Factor w/ 1267 levels "1ST CENTURY INS SVCS INC.",..: 1006 732 1074 732 1217 443 150 1143 423 1152 ...
## $ territory : Factor w/ 10 levels "","AZ-A","AZ-T",..: 2 2 6 2 3 2 6 2 2 2 ...
## $ risknumber : int 1 1 1 1 1 1 2 1 1 1 ...
## $ risktype : Factor w/ 2 levels "Dwelling","Homeowners": 1 1 1 1 1 1 1 1 1 1 ...
## $ yearbuilt : int 1977 1960 2001 1995 2005 1979 1994 2003 1983 1986 ...
## $ log_yearbuilt : num 7.59 7.58 7.6 7.6 7.6 ...
## $ sqft : int 1600 1200 1600 1500 1300 2100 1400 1500 4000 1300 ...
## $ log_sqft : num 7.41 7.09 7.4 7.37 7.17 ...
## $ stories : int 1 1 1 1 1 1 1 1 1 1 ...
## $ roofcd : Factor w/ 7 levels "COMPO","MEMBRANE",..: 1 1 6 1 6 1 1 1 4 1 ...
## $ roofcd_encd : int 8 8 7 8 7 8 8 8 6 8 ...
## $ units : int 1 1 1 1 1 1 1 1 4 1 ...
## $ occupancycd : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ occupancy_encd : int 2 2 2 2 2 2 2 2 2 2 ...
## $ allperilded : int 250 500 500 500 2500 500 500 500 500 500 ...
## $ waterded : int 0 0 0 0 0 0 0 0 0 0 ...
## $ protectionclass : int 3 2 4 4 6 2 3 3 2 3 ...
## $ constructioncd : Factor w/ 5 levels "AF","B","F","M",..: 3 4 3 3 3 4 3 3 3 3 ...
## $ constructioncd_encd : int 5 1 5 5 5 1 5 5 5 5 ...
## $ fire_risk_model_score : int 0 0 0 0 0 0 0 0 0 0 ...
## $ multipolicyind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ multipolicyindumbrella : int 0 0 0 0 0 0 0 0 0 0 ...
## $ earthquakeumbrellaind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ usagetype : Factor w/ 7 levels "COC","PRIMARY",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ usagetype_encd : int 6 6 6 6 6 6 6 6 6 6 ...
## $ ordinanceorlawpct : int 10 10 10 10 10 10 10 10 10 10 ...
## $ functionalreplacementcost : int 0 0 0 0 0 0 0 0 0 0 ...
## $ homegardcreditind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sprinklersystem : int 0 0 0 0 0 0 0 0 0 0 ...
## $ landlordind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rentersinsurance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ firealarmtype : int 0 0 0 0 0 0 0 0 0 0 ...
## $ burglaryalarmtype : int 0 0 0 0 0 0 0 0 0 0 ...
## $ waterdetectiondevice : int 0 0 0 0 0 0 0 0 0 0 ...
## $ neighborhoodcrimewatchind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ propertymanager : int 0 0 0 0 0 0 0 0 0 0 ...
## $ safeguardplusind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kitchenfireextinguisherind : int 0 0 0 0 1 0 0 0 0 0 ...
## $ gatedcommunityind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ deadboltind : int 0 0 0 0 1 0 0 0 0 0 ...
## $ poolind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ replacementcostdwellingind : int 1 0 1 1 1 1 0 1 1 1 ...
## $ replacementvalueind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ serviceline : int 0 0 0 0 0 0 0 0 0 0 ...
## $ equipmentbreakdown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ numberoffamilies : int 1 1 1 1 1 1 1 1 4 1 ...
## $ insuredage : int NA NA NA NA 52 66 NA NA NA NA ...
## $ maritalstatus : Factor w/ 5 levels "~","Divorced",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insurancescore : Factor w/ 3836 levels "(DOES","~","610",..: 2 2 2 2 2 11 2 2 2 2 ...
## $ overriddeninsurancescore : Factor w/ 41 levels "~","01","02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insurancescorevalue : Factor w/ 92 levels "~","630","645",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insscoretiervalueband : Factor w/ 22 levels "~","624-632",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ financialstabilitytier : Factor w/ 42 levels "","~","01","02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ allcov_wp : num 564 446 460 494 422 ...
## $ cova_wp : num 471 418 376 437 364 ...
## $ cova_ep : num 9 145 106 67 213 ...
## $ cova_deductible : int 250 500 500 500 2500 500 500 500 500 500 ...
## $ log_cova_deductible : num 5.52 6.21 6.21 6.21 7.82 ...
## $ cova_limit : int 200000 200000 200000 200000 200000 300000 200000 300000 500000 200000 ...
## $ log_cova_limit : num 12.1 12.2 12 12.2 12.2 ...
## $ cova_ic_nc_water : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hasclaim : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cova_il_nc_water : num 0 0 0 0 0 0 0 0 0 0 ...
## $ log_cova_il_nc_water : num 0 0 0 0 0 0 0 0 0 0 ...
## $ water_risk_3_blk : int 168 201 222 233 266 215 191 184 172 243 ...
## $ log_water_risk_3_blk : num 5.12 5.3 5.4 5.45 5.58 ...
## $ water_risk_fre_3_blk : int 209 243 218 262 214 257 187 200 242 209 ...
## $ log_water_risk_fre_3_blk : num 5.34 5.49 5.38 5.57 5.37 ...
## $ water_risk_sev_3_blk : int 83 85 105 92 128 86 105 95 73 119 ...
## $ log_water_risk_sev_3_blk : num 4.42 4.44 4.65 4.52 4.85 ...
## $ appl_fail_3_blk : int 5 5 5 5 5 5 5 5 4 5 ...
## $ fixture_leak_3_blk : int 1 1 3 3 2 3 1 1 5 1 ...
## $ pipe_froze_3_blk : int 2 0 0 0 0 0 3 2 0 3 ...
## $ plumb_leak_3_blk : int 4 5 1 4 4 4 5 5 5 4 ...
## $ rep_cost_3_blk : int 1 5 5 4 5 4 0 1 4 0 ...
## $ ustructure_fail_3_blk : int 5 5 5 5 5 5 5 5 5 5 ...
## $ waterh_fail_3_blk : int 2 3 3 3 3 3 1 3 2 4 ...
## [list output truncated]
summary(dataset)
## modeldata_id systemidstart systemidend cal_year
## Min. : 1 Min. : 2 Min. : 3 Min. :2009
## 1st Qu.: 499011 1st Qu.:1131009 1st Qu.:1197320 1st Qu.:2012
## Median : 990532 Median :2753282 Median :2865110 Median :2015
## Mean :1008134 Mean :3472157 Mean :3576917 Mean :2015
## 3rd Qu.:1516564 3rd Qu.:5616055 3rd Qu.:5805417 3rd Qu.:2018
## Max. :2175195 Max. :9614887 Max. :9617520 Max. :2020
##
## startdate enddate
## 2019-01-01: 128258 2019-01-01 00:00:00: 128231
## 2018-01-01: 125314 2018-01-01 00:00:00: 125249
## 2020-01-01: 106826 2020-01-01 00:00:00: 106916
## 2017-01-01: 106306 2017-01-01 00:00:00: 106196
## 2013-01-01: 97750 2013-01-01 00:00:00: 97762
## 2014-01-01: 96383 2014-01-01 00:00:00: 96396
## (Other) :1646028 (Other) :1646115
## startdatetm enddatetm
## 2018-06-01 00:00:00: 1848 2019-06-01 00:00:00: 2207
## 2018-07-01 00:00:00: 1703 2018-06-01 00:00:00: 2110
## 2017-09-01 00:00:00: 1695 2019-07-01 00:00:00: 2088
## 2017-07-01 00:00:00: 1693 2018-07-01 00:00:00: 2044
## 2017-06-01 00:00:00: 1575 2019-08-01 00:00:00: 2002
## 2018-08-01 00:00:00: 1572 2018-09-01 00:00:00: 1977
## (Other) :2296779 (Other) :2294437
## ecy log_ecy policynumber
## Min. :0.0027 Min. :-5.914504 CAF0389924: 201
## 1st Qu.:0.2381 1st Qu.:-1.435065 CAF0461789: 184
## Median :0.4462 Median :-0.806988 CAF0475516: 154
## Mean :0.4672 Mean :-1.054233 CAF0393082: 151
## 3rd Qu.:0.6981 3rd Qu.:-0.359393 CAF0464778: 132
## Max. :1.0020 Max. : 0.001998 CAF0468201: 110
## (Other) :2305933
## policy_uniqueid policyterm policytype effectivedate
## Min. : 1 Min. : 1.000 New : 526336 2018-06-01: 2239
## 1st Qu.: 452694 1st Qu.: 2.000 Renewal:1780529 2017-07-01: 2169
## Median : 871909 Median : 3.000 2018-07-01: 2119
## Mean : 876050 Mean : 3.889 2017-06-01: 2114
## 3rd Qu.:1308425 3rd Qu.: 6.000 2017-09-01: 2114
## Max. :1832702 Max. :13.000 2018-08-01: 2059
## (Other) :2294051
## expirationdate policystate policyform persistency
## 2019-06-01: 2239 AZ: 165112 HO3 :1225988 Min. : 0.000
## 2018-07-01: 2167 CA:2060228 DF3 : 905154 1st Qu.: 1.000
## 2019-07-01: 2118 NV: 81525 DF6 : 79589 Median : 3.000
## 2018-06-01: 2117 Form3 : 48635 Mean : 6.204
## 2018-09-01: 2114 FL1-Vacant : 23766 3rd Qu.: 9.000
## 2019-08-01: 2053 FL3-Special: 12342 Max. :103.000
## (Other) :2294057 (Other) : 11391
## companycd carriercd
## Min. : 1.00 CSEICO: 799903
## 1st Qu.: 1.00 CSESG :1506962
## Median :17.00
## Mean :12.02
## 3rd Qu.:17.00
## Max. :19.00
##
## agency_group
## WESTERN GOLD INS AGCY INC. : 242950
## J.E. BROWN and ASSOCS INS SVCS : 93954
## CRUSBERG DECKER INS SVCS INC : 77549
## PIIB - PACIFIC INTERSTATE INS : 73917
## ISU INSURANCE SERVICES OF SAN FRANCISCO INC: 66214
## Acrisure of California : 58796
## (Other) :1693485
## producername territory
## WESTERN GOLD INS AGCY INC. : 240992 CA-B :684831
## J.E. BROWN and ASSOCS INS SVCS: 89662 CA-C :488239
## CRUSBERG DECKER INS SVCS INC : 52069 CA-O :399153
## NATIONAL INSURANCE SOLUTIONS : 31965 CA-A :347771
## BICHLMEIER INSURANCE SRVS INC : 28097 CA-T :153736
## Acrisure of California LLC : 27745 AZ-A :146637
## (Other) :1836335 (Other): 86498
## risknumber risktype yearbuilt log_yearbuilt
## Min. : 0.000 Dwelling :1032242 Min. :1900 Min. :7.523
## 1st Qu.: 1.000 Homeowners:1274623 1st Qu.:1959 1st Qu.:7.580
## Median : 1.000 Median :1979 Median :7.590
## Mean : 1.005 Mean :1976 Mean :7.588
## 3rd Qu.: 1.000 3rd Qu.:1996 3rd Qu.:7.599
## Max. :16.000 Max. :2019 Max. :7.610
##
## sqft log_sqft stories roofcd
## Min. : 800 Min. :6.397 Min. :1.000 COMPO :1143155
## 1st Qu.:1300 1st Qu.:7.182 1st Qu.:1.000 MEMBRANE: 29439
## Median :1700 Median :7.441 Median :1.000 METAL : 6126
## Mean :1872 Mean :7.471 Mean :1.183 OTHER : 270319
## 3rd Qu.:2300 3rd Qu.:7.749 3rd Qu.:1.000 TAR : 54264
## Max. :5000 Max. :9.210 Max. :3.000 TILE : 781454
## WOOD : 22108
## roofcd_encd units occupancycd occupancy_encd
## Min. :1.000 Min. :1.00 NO : 5 Min. :1.000
## 1st Qu.:7.000 1st Qu.:1.00 OCCUPIEDNOW:2139270 1st Qu.:1.000
## Median :7.000 Median :1.00 TENANT : 167590 Median :1.000
## Mean :7.216 Mean :1.12 Mean :1.073
## 3rd Qu.:8.000 3rd Qu.:1.00 3rd Qu.:1.000
## Max. :8.000 Max. :4.00 Max. :3.000
##
## allperilded waterded protectionclass constructioncd
## Min. : 0 Min. : 0.0 Min. : 0.00 AF : 824017
## 1st Qu.: 1000 1st Qu.: 0.0 1st Qu.: 2.00 B : 23379
## Median : 1000 Median : 0.0 Median : 3.00 F :1412361
## Mean : 1361 Mean : 70.1 Mean : 3.02 M : 25734
## 3rd Qu.: 2500 3rd Qu.: 0.0 3rd Qu.: 4.00 OTHER: 21374
## Max. :10000 Max. :10000.0 Max. :10.00
##
## constructioncd_encd fire_risk_model_score multipolicyind
## Min. :1.00 Min. :-1.0000 Min. :0.0000
## 1st Qu.:4.00 1st Qu.: 0.0000 1st Qu.:0.0000
## Median :5.00 Median : 0.0000 Median :0.0000
## Mean :4.55 Mean : 0.2252 Mean :0.1658
## 3rd Qu.:5.00 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :5.00 Max. :18.0000 Max. :1.0000
##
## multipolicyindumbrella earthquakeumbrellaind usagetype
## Min. :0.000000 Min. :0.000000 COC : 12076
## 1st Qu.:0.000000 1st Qu.:0.000000 PRIMARY :1338879
## Median :0.000000 Median :0.000000 RENTAL : 929952
## Mean :0.004352 Mean :0.003712 SEASONAL : 10109
## 3rd Qu.:0.000000 3rd Qu.:0.000000 SECONDARY : 4145
## Max. :1.000000 Max. :1.000000 UNOCCUPIED: 356
## VACANT : 11348
## usagetype_encd ordinanceorlawpct functionalreplacementcost
## Min. :1.000 Min. : 0.000 Min. :0.000000
## 1st Qu.:6.000 1st Qu.: 0.000 1st Qu.:0.000000
## Median :7.000 Median : 10.000 Median :0.000000
## Mean :6.544 Mean : 9.308 Mean :0.001543
## 3rd Qu.:7.000 3rd Qu.: 10.000 3rd Qu.:0.000000
## Max. :7.000 Max. :100.000 Max. :1.000000
##
## homegardcreditind sprinklersystem landlordind rentersinsurance
## Min. :0.000 Min. :0.00000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.000 Median :0.00000 Median :0.00000 Median :0.000000
## Mean :0.141 Mean :0.03073 Mean :0.06455 Mean :0.004432
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :1.000 Max. :1.00000 Max. :1.00000 Max. :1.000000
##
## firealarmtype burglaryalarmtype waterdetectiondevice
## Min. :0.0000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000000
## Median :1.0000 Median :0.0000 Median :0.0000000
## Mean :0.6033 Mean :0.3378 Mean :0.0001647
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000000
## Max. :1.0000 Max. :1.0000 Max. :1.0000000
##
## neighborhoodcrimewatchind propertymanager safeguardplusind
## Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.01469 Mean :0.01574 Mean :0.3547
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## kitchenfireextinguisherind gatedcommunityind deadboltind
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :1.0000
## Mean :0.4405 Mean :0.01349 Mean :0.7197
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## poolind replacementcostdwellingind replacementvalueind
## Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.03309 Mean :0.3304 Mean :0.01724
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.00000
##
## serviceline equipmentbreakdown numberoffamilies insuredage
## Min. :0.0000 Min. :0.0000 Min. :0.00 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.00 1st Qu.: 46.00
## Median :0.0000 Median :0.0000 Median :1.00 Median : 56.00
## Mean :0.1005 Mean :0.1045 Mean :1.12 Mean : 56.19
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.00 3rd Qu.: 66.00
## Max. :1.0000 Max. :1.0000 Max. :4.00 Max. :177.00
## NA's :269594
## maritalstatus insurancescore overriddeninsurancescore
## ~ :1041080 ~ :2213757 ~ :2157007
## Divorced: 9456 99 : 495 99 : 48119
## Married : 613978 KQXKD : 155 07 : 10322
## Single : 619428 (DOES : 136 12 : 8445
## Widowed : 22923 UXKQS : 114 04 : 6887
## ZKKWS : 113 09 : 6270
## (Other): 92095 (Other): 69815
## insurancescorevalue insscoretiervalueband financialstabilitytier
## ~ :2306585 ~ :2306585 :2130960
## 825 : 13 865-880: 49 ~ : 164611
## 875 : 12 837-865: 24 07 : 1070
## 769 : 8 748-774: 23 12 : 906
## 872 : 8 894-945: 23 13 : 693
## 902 : 8 820-837: 22 04 : 686
## (Other): 231 (Other): 139 (Other): 7939
## allcov_wp cova_wp cova_ep cova_deductible
## Min. : 32.0 Min. : 5.0 Min. : 0.049 Min. : 0
## 1st Qu.: 550.0 1st Qu.: 477.0 1st Qu.: 143.359 1st Qu.: 1000
## Median : 774.0 Median : 771.0 Median : 311.837 Median : 1000
## Mean : 903.6 Mean : 988.5 Mean : 464.999 Mean : 1361
## 3rd Qu.: 1091.0 3rd Qu.: 1256.0 3rd Qu.: 605.821 3rd Qu.: 2500
## Max. :18926.0 Max. :14620.0 Max. :12121.894 Max. :10000
##
## log_cova_deductible cova_limit log_cova_limit cova_ic_nc_water
## Min. :0.000 Min. : 100000 Min. : 8.294 Min. :0.000000
## 1st Qu.:6.908 1st Qu.: 300000 1st Qu.:12.385 1st Qu.:0.000000
## Median :6.908 Median : 400000 Median :12.695 Median :0.000000
## Mean :6.994 Mean : 418954 Mean :12.687 Mean :0.006261
## 3rd Qu.:7.824 3rd Qu.: 500000 3rd Qu.:13.024 3rd Qu.:0.000000
## Max. :9.210 Max. :1300000 Max. :14.944 Max. :3.000000
##
## hasclaim cova_il_nc_water log_cova_il_nc_water
## Min. :0.00000 Min. : -5536.9 Min. :-0.10536
## 1st Qu.:0.00000 1st Qu.: 0.0 1st Qu.: 0.00000
## Median :0.00000 Median : 0.0 Median : 0.00000
## Mean :0.00613 Mean : 83.1 Mean : 0.05379
## 3rd Qu.:0.00000 3rd Qu.: 0.0 3rd Qu.: 0.00000
## Max. :1.00000 Max. :522735.2 Max. :13.16683
## NA's :3
## water_risk_3_blk log_water_risk_3_blk water_risk_fre_3_blk
## Min. : 21.0 Min. :3.045 Min. : 19.0
## 1st Qu.: 135.0 1st Qu.:4.905 1st Qu.: 108.0
## Median : 185.0 Median :5.220 Median : 154.0
## Mean : 201.5 Mean :5.199 Mean : 168.4
## 3rd Qu.: 243.0 3rd Qu.:5.493 3rd Qu.: 208.0
## Max. :1491.0 Max. :7.307 Max. :2308.0
##
## log_water_risk_fre_3_blk water_risk_sev_3_blk log_water_risk_sev_3_blk
## Min. :2.944 Min. : 33.0 Min. :3.497
## 1st Qu.:4.682 1st Qu.:106.0 1st Qu.:4.663
## Median :5.037 Median :125.0 Median :4.828
## Mean :5.011 Mean :127.7 Mean :4.821
## 3rd Qu.:5.338 3rd Qu.:147.0 3rd Qu.:4.990
## Max. :7.744 Max. :313.0 Max. :5.746
##
## appl_fail_3_blk fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk
## Min. :0.00 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:4.00 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
## Median :5.00 Median :2.000 Median :2.000 Median :4.000
## Mean :4.13 Mean :1.882 Mean :1.746 Mean :3.068
## 3rd Qu.:5.00 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.000
##
## rep_cost_3_blk ustructure_fail_3_blk waterh_fail_3_blk
## Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:5.000 1st Qu.:5.000 1st Qu.:0.00
## Median :5.000 Median :5.000 Median :1.00
## Mean :4.801 Mean :4.435 Mean :1.16
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:2.00
## Max. :5.000 Max. :5.000 Max. :5.00
##
## loaddate customer_cnt_active_policies
## 2020-07-29 05:01:19.56:2306865 Min. : 1.00
## 1st Qu.: 1.00
## Median : 1.00
## Mean : 1.69
## 3rd Qu.: 1.00
## Max. :147.00
##
## customer_cnt_active_policies_binned
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 2.806
## 3rd Qu.: 1.000
## Max. :150.000
##
dataset$cova_ic_nc_water_color <- as.factor(dataset$cova_ic_nc_water)
ggplot(dataset, aes(x = cova_ic_nc_water, fill=cova_ic_nc_water_color)) +
geom_bar() +
labs(x = 'Number of Claims', y = 'Count', title = 'Histogram of Number of Claims') +
scale_fill_manual("legend", values = c("0" = "#56B4E9", "1" = "red", "2" = "red", "3" = "red")) +
geom_text(stat='count', aes(label=..count..), vjust=1)
Since the cases when there are more then 1 claim per exposure are very rare and, I create a new logical attribute HasClaim with values 1 or 0. It can be used in logistic regression but I use it visualize if there is any visual dependency between predictors and claims.
dataset$hasclaim <- as.factor(dataset$hasclaim)
ggplot(dataset, aes(x = hasclaim, fill=hasclaim)) +
geom_bar() +
scale_fill_manual("legend", values = c("0" = "#56B4E9", "1" = "red")) +
labs(x = 'Adjusted Number of Claims', y = 'Count', title = 'Adjusted Histogram of Number of Claims') +
geom_text(stat='count', aes(label=..count..), vjust=1)
There is cery low percent of water related claims: 0.6%
(the order is from XGB Classification feature importance)
ratio <- nrow(dataset)
#grid for multiplots
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
require(grid)
plots <- c(list(...), plotlist)
numPlots = length(plots)
if (is.null(layout)) {
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots == 1) {
print(plots[[1]])
} else {
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
#continuous attributes
plot_continuous <- function (col_name) {
p1 <- ggplot(dataset, aes(x = .data[[col_name]], fill=hasclaim)) +
geom_histogram(bins=100) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
labs(x = col_name, y = 'Count', title = paste("Histogram of", col_name))
#p2 <- ggplot(dataset, aes(x = .data[[col_name]])) +
#geom_density() +
# labs(x = col_name, y = 'Density', title = paste("Density of", col_name))
pd_col <- pd_dataset[pd_dataset$feature == col_name,][c('value','pd')]
p2 <- ggplot(pd_col, aes(x = value, y = pd)) +
geom_line(aes(color="darkred")) +
labs(x = col_name, y = 'pd', title = paste("Claim Partial Dependency of", col_name))
p3 <- ggplot(dataset, aes(x=hasclaim, y=.data[[col_name]], col=hasclaim, fill=hasclaim)) +
geom_boxplot(notch = TRUE) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
theme(legend.position = "none") +
labs(y = col_name, title = paste("Box Plot of", col_name, "with hasclaim"))
vec <- dataset[[col_name]]
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
p4 <- ggplot(dataset, aes(sample = .data[[col_name]], col='red')) +
stat_qq() +
geom_abline(slope = slope, intercept = int) +
theme(legend.position = "none") +
labs(y = col_name, title = paste("QQ Plot of", col_name))
multiplot(p1,p2,p3,p4, cols=2)
}
#categorical attributes - multiplot does not work for fun modeling
plot_categorical <- function(col_name) {
if ( col_name == 'roofcd_encd') {
col_name_original <- 'roofcd'
}
else if ( col_name == 'usagetype_encd' ) {
col_name_original <- 'usagetype'
}
else if ( col_name == 'constructioncd_encd' ) {
col_name_original <- 'constructioncd'
}
else if ( col_name == 'occupancy_encd' ) {
col_name_original <- 'occupancycd'
}
else {
col_name_original <- col_name
}
pd_col <- pd_dataset[pd_dataset$feature == col_name,][c('value','pd')]
colnames(pd_col) <- c(col_name,'pd')
df <- merge(dataset[c(col_name,'hasclaim')],pd_col,by=col_name)
if (grepl('encd',col_name) |
col_name =='fire_risk_model_score' |
col_name =='customer_cnt_active_policies_binned' |
col_name =='cova_deductible' |
col_name =='cova_limit' |
col_name =='protectionclass' |
col_name =='ordinanceorlawpct' |
col_name =='numberoffamilies' |
col_name =='waterded' |
col_name =='units' |
col_name =='stories') {
XBreaks <- as.vector(unlist(unique(dataset[c(col_name)])))
XLabels <- as.vector(unlist(unique(dataset[c(col_name_original)])))
}
else if (col_name =='pipe_froze_3_blk' |
col_name =='water_risk_3_blk' |
col_name =='ustructure_fail_3_blk' |
col_name =='water_risk_fre_3_blk' |
col_name =='waterh_fail_3_blk' |
col_name =='rep_cost_3_blk' |
col_name =='plumb_leak_3_blk' |
col_name =='appl_fail_3_blk' |
col_name =='fixture_leak_3_blk') {
XBreaks <- c(0,1,2,3,4,5)
XLabels <- c('Low', 'Elevated', 'Below Avg', 'Average', 'High', 'Highest')
}
else {
XBreaks <- c(0,1)
XLabels <- c("No", "Yes")
}
cols <- c('PD'='#f04546')
p1 = ggplot(df) +
geom_bar(aes(x = .data[[col_name]], fill=hasclaim)) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
geom_line(aes(x = .data[[col_name]], y=(pd)*ratio*100,color='PD')) +
scale_y_continuous(sec.axis = sec_axis(~./ratio, name = "PD")) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
labs(x = col_name, y = 'Count', title = paste("Histogram and claims partial dependency of", col_name_original)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
if (length(unique(dataset[[col_name_original]])) <= 6)
{p2 = bayesian_plot(data=dataset, input=col_name_original, target='hasclaim')}
p3 = cross_plot(data=dataset, str_input=col_name_original, str_target='hasclaim',plot_type='percentual')
if (length(unique(dataset[[col_name_original]])) <= 6) {
print(p1)
print(p2)
print(p3)
}
else {
print(p1)
print(p3)
}
}
plot_categorical('usagetype_encd')
## NULL
The more property is used, the higher claims rate.
plot_categorical('customer_cnt_active_policies_binned')
## NULL
If the same customer has more then 10 active policies, the claim rate is lower.
plot_continuous('ecy')
## Loading required package: grid
The longer the exposure, the higher claim rate.
plot_categorical('cova_deductible')
## NULL
The claim rate is higher in low deductible policies.
plot_continuous('yearbuilt')
More claims are in newer houses but not in modern.
plot_categorical('landlordind')
## NULL
This is a discount based on the number of policies for the same customer. It’s correlated with customer_cnt_active_policies_binned and has the same claim dependency but without details: more policies less claims rate.
plot_categorical('pipe_froze_3_blk')
## NULL
Everything indicates we have more claims in the lowest category. It may makes sense for california, where we have most insured properties in an area where low tempretures are rare but not useful for further analysis.
plot_categorical('roofcd_encd')
## NULL
Visible higher claim rate in WOOD, TILE, and maybe, TAR and OTHER
plot_continuous('sqft')
The higher sqft, the higher claim rate till some limit, where it is not increased.
plot_categorical('firealarmtype')
## NULL
Firealarmtype is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. Adding the predictor to a model increase overfitting.
plot_categorical('stories')
## NULL
Theer is a clear evidence, the 2-stories properties have more claims. However, the attributes is broken in the database. There are a lot of empty or high values (known issues).
plot_categorical('equipmentbreakdown')
## NULL
Equipmentbreakdown is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. Adding the predictor to a model increase overfitting.
plot_categorical('cova_limit')
## NULL
More claims from more expensive properties.
plot_categorical('replacementvalueind')
## NULL
Replacementvalueind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes.
plot_categorical('propertymanager')
## NULL
Even if PropertyManager decrease number of claims according to visual analysis it is not clear from the partial dependency. Probably due to correlations.
plot_categorical('multipolicyind')
## NULL
On the one hand, there are more claims in “Yes” multipolicyind category, on the other, it’s different in the partial dependency. The predictor is not very significant in GLM
plot_categorical('poolind')
## NULL
More claims in properties with pools according to the charts and baysian comparizon but it’s different in the partial dependency. GLM results are more close to the visual and significant. The difference between “No” and “Yes” categories is very small and can be due to errors.
plot_categorical('replacementcostdwellingind')
## NULL
Replacementcostdwellingind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. There is almost no difference between categories in the partial dependency.
plot_categorical('safeguardplusind')
## NULL
Safeguardplusind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. There is almost no difference between categories in the partial dependency and charts.
plot_categorical('ustructure_fail_3_blk')
## NULL
There are slight increase in the “Below Average” category partial dependency but not in visual.
plot_continuous('water_risk_3_blk')
The higher the score, the more claims according to box-plots and partial dependency.
The rest of the predictors are not very important or directly related to water claims.
plot_categorical('serviceline')
## NULL
plot_categorical('protectionclass')
## NULL
plot_continuous('water_risk_fre_3_blk')
plot_categorical('rep_cost_3_blk')
## NULL
plot_categorical('waterh_fail_3_blk')
## NULL
plot_categorical('deadboltind')
## NULL
plot_categorical('homegardcreditind')
## NULL
plot_categorical('ordinanceorlawpct')
## NULL
plot_categorical('occupancy_encd')
## NULL
plot_categorical('burglaryalarmtype')
## NULL
plot_categorical('waterded')
## NULL
plot_categorical('plumb_leak_3_blk')
## NULL
plot_categorical('appl_fail_3_blk')
## NULL
plot_categorical('numberoffamilies')
## NULL
plot_categorical('units')
## NULL
plot_continuous('water_risk_sev_3_blk')
plot_categorical('multipolicyindumbrella')
## NULL
plot_categorical('kitchenfireextinguisherind')
## NULL
plot_categorical('fixture_leak_3_blk')
## NULL
plot_categorical('constructioncd_encd')
## NULL
plot_categorical('fire_risk_model_score')
## NULL
plot_categorical('gatedcommunityind')
## NULL
plot_categorical('sprinklersystem')
## NULL
plot_categorical('neighborhoodcrimewatchind')
## NULL
plot_categorical('rentersinsurance')
## NULL
plot_categorical('earthquakeumbrellaind')
## NULL
plot_categorical('functionalreplacementcost')
## NULL
ggplot(data=dataset, mapping = aes(x=yearbuilt, y=sqft, color=hasclaim)) +
geom_point() +
scale_color_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
labs(title = "Sqft vs YearBuilt and HasClaim")
We have more policies between 1945 and 2010 with more claims in this period.
ggplot(dataset, aes(x=roofcd, y=yearbuilt, col=hasclaim, fill=hasclaim)) +
geom_boxplot(notch = TRUE) +
#scale_color_manual(breaks = c("0","1"),
# values=c("#56B4E9","red")) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
theme(legend.position = "none") +
labs(x = "Roof Code", title = paste("Box Plot of YearBuilt vs Roof Code"))
Tile and wood is used in more modern houses with lower rate of claims. TAR is used in older. COMPO, TAR and OTHER have more claims then other